library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(DT)
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
library(knitr)
## Warning: package 'knitr' was built under R version 3.6.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(correlationfunnel)
## Warning: package 'correlationfunnel' was built under R version 3.6.3
## == correlationfunnel Tip #1 ============================================================
## Make sure your data is not overly imbalanced prior to using `correlate()`.
## If less than 5% imbalance, consider sampling. :)
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.3
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(e1071)
## Warning: package 'e1071' was built under R version 3.6.3
Train<-read.csv("pml-training.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
Test<-read.csv("pml-testing.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
dim(Train); 
## [1] 19622   160
dim(Test)
## [1]  20 160
train_split <- createDataPartition(Train$classe, p = 0.8, list = F)
Val_data <- Train[-train_split,]
Train <- Train[train_split,]
dim(Train); 
## [1] 15699   160
dim(Val_data)
## [1] 3923  160
table(Train$classe)/nrow(Train)
## 
##         A         B         C         D         E 
## 0.2843493 0.1935155 0.1744060 0.1638958 0.1838334
mb <- sapply(select(Train,names(Train)[grepl("_belt",names(Train))]),function(x) sum(is.na(x)))
mb
##            roll_belt           pitch_belt             yaw_belt 
##                    0                    0                    0 
##     total_accel_belt   kurtosis_roll_belt  kurtosis_picth_belt 
##                    0                15385                15407 
##    kurtosis_yaw_belt   skewness_roll_belt skewness_roll_belt.1 
##                15699                15385                15407 
##    skewness_yaw_belt        max_roll_belt       max_picth_belt 
##                15699                15379                15379 
##         max_yaw_belt        min_roll_belt       min_pitch_belt 
##                15385                15379                15379 
##         min_yaw_belt  amplitude_roll_belt amplitude_pitch_belt 
##                15385                15379                15379 
##   amplitude_yaw_belt var_total_accel_belt        avg_roll_belt 
##                15385                15379                15379 
##     stddev_roll_belt        var_roll_belt       avg_pitch_belt 
##                15379                15379                15379 
##    stddev_pitch_belt       var_pitch_belt         avg_yaw_belt 
##                15379                15379                15379 
##      stddev_yaw_belt         var_yaw_belt         gyros_belt_x 
##                15379                15379                    0 
##         gyros_belt_y         gyros_belt_z         accel_belt_x 
##                    0                    0                    0 
##         accel_belt_y         accel_belt_z        magnet_belt_x 
##                    0                    0                    0 
##        magnet_belt_y        magnet_belt_z 
##                    0                    0
ma <- sapply(select(Train,names(Train)[grepl("_arm",names(Train))]),function(x) sum(is.na(x)))
ma
##            roll_arm           pitch_arm             yaw_arm     total_accel_arm 
##                   0                   0                   0                   0 
##       var_accel_arm        avg_roll_arm     stddev_roll_arm        var_roll_arm 
##               15379               15379               15379               15379 
##       avg_pitch_arm    stddev_pitch_arm       var_pitch_arm         avg_yaw_arm 
##               15379               15379               15379               15379 
##      stddev_yaw_arm         var_yaw_arm         gyros_arm_x         gyros_arm_y 
##               15379               15379                   0                   0 
##         gyros_arm_z         accel_arm_x         accel_arm_y         accel_arm_z 
##                   0                   0                   0                   0 
##        magnet_arm_x        magnet_arm_y        magnet_arm_z   kurtosis_roll_arm 
##                   0                   0                   0               15438 
##  kurtosis_picth_arm    kurtosis_yaw_arm   skewness_roll_arm  skewness_pitch_arm 
##               15440               15386               15438               15440 
##    skewness_yaw_arm        max_roll_arm       max_picth_arm         max_yaw_arm 
##               15386               15379               15379               15379 
##        min_roll_arm       min_pitch_arm         min_yaw_arm  amplitude_roll_arm 
##               15379               15379               15379               15379 
## amplitude_pitch_arm   amplitude_yaw_arm 
##               15379               15379
mf <- sapply(select(Train,names(Train)[grepl("_forearm",names(Train))]),function(x) sum(is.na(x)))
mf
##            roll_forearm           pitch_forearm             yaw_forearm 
##                       0                       0                       0 
##   kurtosis_roll_forearm  kurtosis_picth_forearm    kurtosis_yaw_forearm 
##                   15443                   15444                   15699 
##   skewness_roll_forearm  skewness_pitch_forearm    skewness_yaw_forearm 
##                   15443                   15444                   15699 
##        max_roll_forearm       max_picth_forearm         max_yaw_forearm 
##                   15379                   15379                   15443 
##        min_roll_forearm       min_pitch_forearm         min_yaw_forearm 
##                   15379                   15379                   15443 
##  amplitude_roll_forearm amplitude_pitch_forearm   amplitude_yaw_forearm 
##                   15379                   15379                   15443 
##     total_accel_forearm       var_accel_forearm        avg_roll_forearm 
##                       0                   15379                   15379 
##     stddev_roll_forearm        var_roll_forearm       avg_pitch_forearm 
##                   15379                   15379                   15379 
##    stddev_pitch_forearm       var_pitch_forearm         avg_yaw_forearm 
##                   15379                   15379                   15379 
##      stddev_yaw_forearm         var_yaw_forearm         gyros_forearm_x 
##                   15379                   15379                       0 
##         gyros_forearm_y         gyros_forearm_z         accel_forearm_x 
##                       0                       0                       0 
##         accel_forearm_y         accel_forearm_z        magnet_forearm_x 
##                       0                       0                       0 
##        magnet_forearm_y        magnet_forearm_z 
##                       0                       0
md <- sapply(select(Train,names(Train)[grepl("_dumbbell",names(Train))]),function(x) sum(is.na(x)))
md
##            roll_dumbbell           pitch_dumbbell             yaw_dumbbell 
##                        0                        0                        0 
##   kurtosis_roll_dumbbell  kurtosis_picth_dumbbell    kurtosis_yaw_dumbbell 
##                    15382                    15379                    15699 
##   skewness_roll_dumbbell  skewness_pitch_dumbbell    skewness_yaw_dumbbell 
##                    15382                    15379                    15699 
##        max_roll_dumbbell       max_picth_dumbbell         max_yaw_dumbbell 
##                    15379                    15379                    15382 
##        min_roll_dumbbell       min_pitch_dumbbell         min_yaw_dumbbell 
##                    15379                    15379                    15382 
##  amplitude_roll_dumbbell amplitude_pitch_dumbbell   amplitude_yaw_dumbbell 
##                    15379                    15379                    15382 
##     total_accel_dumbbell       var_accel_dumbbell        avg_roll_dumbbell 
##                        0                    15379                    15379 
##     stddev_roll_dumbbell        var_roll_dumbbell       avg_pitch_dumbbell 
##                    15379                    15379                    15379 
##    stddev_pitch_dumbbell       var_pitch_dumbbell         avg_yaw_dumbbell 
##                    15379                    15379                    15379 
##      stddev_yaw_dumbbell         var_yaw_dumbbell         gyros_dumbbell_x 
##                    15379                    15379                        0 
##         gyros_dumbbell_y         gyros_dumbbell_z         accel_dumbbell_x 
##                        0                        0                        0 
##         accel_dumbbell_y         accel_dumbbell_z        magnet_dumbbell_x 
##                        0                        0                        0 
##        magnet_dumbbell_y        magnet_dumbbell_z 
##                        0                        0
dc <- c(names(mb[mb!=0]), names(ma[ma!=0]),names(mf[mf!=0]),names(md[md!=0]))
length(dc)
## [1] 100
data_an<-tbl_df(Train%>%select(-dc,-c(X,user_name,raw_timestamp_part_1,raw_timestamp_part_2,cvtd_timestamp,new_window,num_window)))
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(dc)` instead of `dc` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
data_an$classe<-as.factor(data_an$classe)
data_an[,1:52]<-lapply(data_an[,1:52],as.numeric)
dim(data_an)
## [1] 15699    53
c_c <- cor(select(data_an, -classe))
diag(c_c) <- 0
c_c <- which(abs(c_c)>0.8,arr.ind = T)
c_c <- unique(row.names(c_c))
corrplot(cor(select(data_an,c_c)),type="upper",order="hclust",method="number")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(c_c)` instead of `c_c` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

cfd <-data_an%>%binarize(n_bins=4,thresh_infreq=0.01)
ca <- cfd %>% correlate(target=classe__A) 
cb<-cfd%>%correlate(target=classe__B)
cc <- cfd%>%correlate(target=classe__C)
cd<-cfd%>%correlate(target=classe__D)
ce<-cfd %>% correlate(target = classe__E)
a_col <- c("magnet_arm_x", "pitch_forearm" , "magnet_dumbbell_y", "roll_forearm", "gyros_dumbbell_y") 
b_col <- c("magnet_dumbbell_y", "magnet_dumbbell_x" , "roll_dumbbell" , 
           "magnet_belt_y" , "accel_dumbbell_x" )
c_col <- c("magnet_dumbbell_y", "roll_dumbbell" , "accel_dumbbell_y" , 
           "magnet_dumbbell_x", "magnet_dumbbell_z")
d_col <- c("pitch_forearm" , "magnet_arm_y" , "magnet_forearm_x",
           "accel_dumbbell_y", "accel_forearm_x")
e_col <- c("magnet_belt_y" , "magnet_belt_z" , "roll_belt", 
           "gyros_belt_z" , "magnet_dumbbell_y")
fc <- character()
for(c in c(a_col,b_col,c_col,d_col,e_col)){
  fc <- union(fc,c)
}
data_an2 <- data_an%>%select(fc,classe)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(fc)` instead of `fc` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
data.frame("arm" = sum(grepl("_arm",fc)),"forearm"=sum(grepl("_forearm",fc)),"belt"=sum(grepl("_belt",fc)),"dumbbell"=sum(grepl("_dumbbell",fc)))
##   arm forearm belt dumbbell
## 1   2       4    4        7
m_d<-function(data, mapping, ...) {
  ggplot(data = data, mapping=mapping)+geom_density(..., alpha = 0.3)+scale_fill_brewer(palette="Set2") 
}
m_p<-function(data, mapping, ...) {
  ggplot(data = data, mapping=mapping)+geom_point(..., alpha = 0.1)+ scale_fill_brewer(palette="Set2") 
}
ggpairs(data_an2,columns = 1:5,aes(color = classe),lower = list(continuous = m_p),diag = list(continuous = m_d))

ggpairs(data_an2,columns=6:10,aes(color=classe),lower=list(continuous=m_p),diag =list(continuous=m_d))

ggpairs(data_an2,columns = 11:17,aes(color = classe),lower=list(continuous=m_p),diag=list(continuous=m_d))

TrainF <- Train %>% select(fc,classe)
Val_dataF<-Val_data %>% select(fc,classe)
TrainF[,1:17] <- sapply(TrainF[,1:17],as.numeric)
Val_dataF[,1:17] <- sapply(Val_dataF[,1:17],as.numeric)
lvl<-c("A", "B", "C", "D", "E")
ppo <- preProcess(TrainF[,-18],method = c("center","scale","BoxCox"))
Tr_x <- predict(ppo,select(TrainF,-classe))
Tr_y <- factor(TrainF$classe,levels=lvl)
V_x <- predict(ppo,select(Val_dataF,-classe))
V_y<- factor(Val_dataF$classe,levels=lvl)
C_tr <- trainControl(method="cv", number=5)
CT_m <- train(x = Tr_x,y = Tr_y,method = "rpart", trControl = C_tr)
RF_m <- train(x = Tr_x,y = Tr_y, method = "rf", trControl = C_tr,verbose=FALSE, metric = "Accuracy")
GBM_m <- train(x = Tr_x,y = Tr_y,method = "gbm",trControl=C_tr,verbose=FALSE)
SVM_m <- svm(x = Tr_x,y = Tr_y,kernel = "polynomial", cost = 10)
confusionMatrix(predict(CT_m,V_x),V_y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1011  327  321  302  106
##          B   14  263   29  110   94
##          C   89  169  334  231  212
##          D    0    0    0    0    0
##          E    2    0    0    0  309
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4887          
##                  95% CI : (0.4729, 0.5044)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.331           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9059  0.34651  0.48830   0.0000  0.42857
## Specificity            0.6238  0.92193  0.78358   1.0000  0.99938
## Pos Pred Value         0.4891  0.51569  0.32271      NaN  0.99357
## Neg Pred Value         0.9434  0.85467  0.87881   0.8361  0.88594
## Prevalence             0.2845  0.19347  0.17436   0.1639  0.18379
## Detection Rate         0.2577  0.06704  0.08514   0.0000  0.07877
## Detection Prevalence   0.5269  0.13000  0.26383   0.0000  0.07928
## Balanced Accuracy      0.7649  0.63422  0.63594   0.5000  0.71397
confusionMatrix(predict(RF_m,V_x),V_y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1114    9    0    6    0
##          B    0  739    6    1    1
##          C    1    9  672    9    1
##          D    0    2    6  627    2
##          E    1    0    0    0  717
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9862          
##                  95% CI : (0.9821, 0.9896)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9826          
##                                           
##  Mcnemar's Test P-Value : 0.01261         
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9982   0.9736   0.9825   0.9751   0.9945
## Specificity            0.9947   0.9975   0.9938   0.9970   0.9997
## Pos Pred Value         0.9867   0.9893   0.9711   0.9843   0.9986
## Neg Pred Value         0.9993   0.9937   0.9963   0.9951   0.9988
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2840   0.1884   0.1713   0.1598   0.1828
## Detection Prevalence   0.2878   0.1904   0.1764   0.1624   0.1830
## Balanced Accuracy      0.9964   0.9856   0.9881   0.9860   0.9971
plot(RF_m$finalModel,main="Error VS no of tree")

confusionMatrix(predict(GBM_m,V_x),V_y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1100   38    5    7    3
##          B    8  628   42    8   11
##          C    3   59  614   39   11
##          D    3   31   21  586   13
##          E    2    3    2    3  683
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9205          
##                  95% CI : (0.9116, 0.9287)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8993          
##                                           
##  Mcnemar's Test P-Value : 2.621e-09       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9857   0.8274   0.8977   0.9114   0.9473
## Specificity            0.9811   0.9782   0.9654   0.9793   0.9969
## Pos Pred Value         0.9540   0.9010   0.8457   0.8960   0.9856
## Neg Pred Value         0.9942   0.9594   0.9781   0.9826   0.9882
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2804   0.1601   0.1565   0.1494   0.1741
## Detection Prevalence   0.2939   0.1777   0.1851   0.1667   0.1767
## Balanced Accuracy      0.9834   0.9028   0.9315   0.9453   0.9721
confusionMatrix(predict(SVM_m,V_x),V_y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1111   57   26   20    3
##          B    1  653   25    5    3
##          C    2   40  623   41    5
##          D    2    5    7  574   10
##          E    0    4    3    3  700
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9332          
##                  95% CI : (0.9249, 0.9408)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9152          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9955   0.8603   0.9108   0.8927   0.9709
## Specificity            0.9622   0.9893   0.9728   0.9927   0.9969
## Pos Pred Value         0.9129   0.9505   0.8762   0.9599   0.9859
## Neg Pred Value         0.9982   0.9672   0.9810   0.9792   0.9935
## Prevalence             0.2845   0.1935   0.1744   0.1639   0.1838
## Detection Rate         0.2832   0.1665   0.1588   0.1463   0.1784
## Detection Prevalence   0.3102   0.1751   0.1812   0.1524   0.1810
## Balanced Accuracy      0.9789   0.9248   0.9418   0.9427   0.9839
Test2 <- Test %>% select(fc,problem_id)
xTest <- Test2 %>% select(fc)
result <- data.frame("problem_id" = Test$problem_id,"PREDICTION_RF"=predict(RF_m,xTest),"PREDICTION_GBM"=predict(GBM_m,xTest),"PREDICTION_SVM"=predict(SVM_m,xTest))
result
##    problem_id PREDICTION_RF PREDICTION_GBM PREDICTION_SVM
## 1           1             E              E              C
## 2           2             A              E              A
## 3           3             A              D              B
## 4           4             E              E              C
## 5           5             A              A              A
## 6           6             E              D              A
## 7           7             E              E              A
## 8           8             B              D              A
## 9           9             A              A              B
## 10         10             E              E              E
## 11         11             A              E              B
## 12         12             A              D              A
## 13         13             B              D              E
## 14         14             A              D              B
## 15         15             E              E              A
## 16         16             E              E              A
## 17         17             E              E              C
## 18         18             B              E              A
## 19         19             E              E              A
## 20         20             E              E              E